youth_orig <- read_csv("../../dssg-2025-mentor-canada/Data/encodedselectall.csv")
head(youth_orig)5 OHE cleaning, linear reg, and XGBoost
(with dataset cleaning in the beginning)
6 Data Type Cleaning & Unification
6.1 Load cleaned data set with dummy-coded (one-hot encoded) variables:
Outcome variables (11 expected in total):
Q12. High school GED
|
Q44. Help-seeking
|
Q13. Further education/training -> counts at each level
Q13a. Higher education (controlling for age)
|
Q45. Mental health
|
Q14. Employment status
|
Q46. Mental well-being
|
Q41. Career planning
|
Q27. Belonging
|
Q43. Social capital
|
Q50. Volunteering
|
# Outcome (dependent-variable) column names, grouped by survey construct.
# Numbered item batteries follow the pattern <PREFIX>_<k>_<k>, so they are
# generated with sprintf() instead of being spelled out one by one.
high_school_ged <- c(
  "QS1_19_HIGHSCHOOL",
  "QS1_20_HIGHSCHOOL"
)
further_educ <- c(
  "QS1_21_FURTHEDUCA",
  "QS1_22_HIGHESTEDU",
  "QS1_23_YEARCOMPLE"
)
employment_status <- "QS1_25_EMPLOYMENT"
yearly_income <- "QS1_28_EMPLOYMENT_calculated"
career_planning <- sprintf("QS4_3_CAREERPLANNIN1_%d_%d", 1:8, 1:8)
social_capital <- sprintf("QS4_7_SOCIALCAPITAL1_%d_%d", 1:4, 1:4)
help_seeking <- sprintf("QS4_8_HELPSEEKING1_%d_%d", 1:10, 1:10)
mental_health <- "QS4_9_MENTALHEALTH"
well_being <- sprintf("QS4_10_MENTALWELLBE1_%d_%d", 1:7, 1:7)
belonging <- "QS4_11_BELONGING"
volunteering <- c(
  "QS4_14_FORMALVOL",
  sprintf("QS4_15_TIMEIFFOR%d", 1:4),
  "QS4_16_FORMALVOL"
)
youth_iv <- youth_orig |>
select(-all_of(c(high_school_ged,
further_educ,
employment_status,
yearly_income,
career_planning,
social_capital,
help_seeking,
mental_health,
well_being,
belonging,
volunteering)))
youth_dv <- youth_orig |>
select(all_of(c(high_school_ged,
further_educ,
employment_status,
yearly_income,
career_planning,
social_capital,
help_seeking,
mental_health,
well_being,
belonging,
volunteering)))6.1.0.1 Ensure all postal code are in proper upper cases:
youth_orig$geo_postcode_fsa <- toupper(youth_orig$geo_postcode_fsa)6.1.1 Remove pre-processed column (binarized) by previous researchers
youth <- youth_orig |>
select(-c(QS1_8_NEWCOMERYEAR_cat:Week_income))
youth <- youth |>
select(-c(QS2_16_FORMAT_any:total_yearly_income))6.1.2 Count proportion of empty entries per variable:
# Tabulate missingness for every column of `youth`, in long format (one row
# per column): NA count, NA percentage, and number of non-missing entries.
n <- nrow(youth)
na_count_table <- youth |>
  # across() replaces the superseded summarize_all(); the result is the same
  # one-row summary holding the NA count of each column.
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(
    cols = everything(),
    names_to = "columns",
    values_to = "na_count"
  ) |>
  mutate(
    na_percent = na_count / n * 100,
    non_na_entries = n - na_count
  )
# Columns in which at least 70% of the entries are missing.
na_count_70_percent <- na_count_table |>
  filter(na_percent >= 70)
head(na_count_70_percent)# A tibble: 6 × 4
columns na_count na_percent non_na_entries
<chr> <int> <dbl> <int>
1 geo_postcode_fsa 2250 79.3 588
2 QS1_5_INDIGENOUSHS 2544 89.6 294
3 QS1_8_NEWCOMERYEAR 2421 85.3 417
4 QS1_17_INCARE 2752 97.0 86
5 QS1_26_EMPLOYMENT 2437 85.9 401
6 QS1_27_PLANNINGRE 2437 85.9 401
176 variables have >= 70% empty NA entries.
6.1.2.1 Identify highly sparse column (number of entries < 30):
cols_less_than_30_entires <- na_count_table |>
filter(non_na_entries < 30)
print(cols_less_than_30_entires, n = nrow(cols_less_than_30_entires))
cols_less_than_30_entires$columns
cols_less_than_90_percent <- na_count_table |>
filter(na_percent >= 90)
print(cols_less_than_90_percent, n = nrow(cols_less_than_90_percent))
cols_less_than_90_percent$columns6.1.3 Remove Irrelevant Columns:
youth <- youth |>
# Remove text columns:
select(-c("QS2_25_YOUTHINIT1", "QS2_29_MATCHCRITERIA_O", "QS2_39_2_Other",
"QS4_23_PASTFORMA1", "QS2_39_1_Other", "QS2_39_3_Other")) |>
# Remove columns with <30 valid entries:
select(-any_of(cols_less_than_30_entires$columns)) |>
# Remove repeated income columns:
select(-c("QS1_29_EMPLOYMENT", "QS1_30_EMPLOYMENT1", "QS1_30_EMPLOYMENT2",
"QS1_31_EMPLOYMENT", "QS1_32_WEEKLY")) |>
# These columns were selected out in other files, so any_of() is used:
select(-any_of(c("QS2_15_RELATIONSHIP1", "QS2_5_Other", "QS2_17_TYPE_1_Other"))) |>
# Take out postal code--it will be useful only when building geospatial plot:
select(-geo_postcode_fsa) |>
# Text column: Name of organization with with respondent served as a mentor
select(-c("QS4_19_CURRENTME1", "QS4_19_CURRENTME2", "QS4_19_CURRENTME3")) |>
# Text column: name of mentor organization, else unsure (QS4_23_PASTFORMA2) or prefer not say (QS4_23_PASTFORMA3):
select(-c('QS4_23_PASTFORMA2', 'QS4_23_PASTFORMA3')) |>
# This is an 'other' column for text written response:
select(-any_of(c("QS2_34_SUPPORTS1_10_10"))) |>
# text entry:
select(-any_of(c("QS2_26_INITIATIONEV"))) |>
# whether respondent remembered name of organization:
select(-any_of(c("QS2_27_MENTORPROGRA1")))6.1.3.1 Remove Prefer not to answer / Don't know responses in the following ordinal items:
QS4_9_MENTALHEALTH"Don’t know""Prefer not to answer"
QS4_11_BELONGING"Don’t know""Prefer not to answer"
QS4_20_MENTEEAGE"Unsure""Prefer not to say"
nrow(youth) # 2838
youth <- youth |>
filter(!QS4_9_MENTALHEALTH %in% c("Don’t know", "Prefer not to answer"), # ordinal
!QS4_11_BELONGING %in% c("Don’t know", "Prefer not to answer"), # ordinal
!QS4_20_MENTEEAGE %in% c("Unsure", "Prefer not to say")) # ordinal
nrow(youth) # 2396# Responses containing "Unsure"
unsure_counts <- tidy(colSums(youth == "Unsure", na.rm = TRUE))|>
mutate(is_unsure = ifelse(x >= 1, TRUE, FALSE)) |>
filter(`is_unsure` == "TRUE")
# Responses containing "Prefer not to say"
pns_counts <- tidy(colSums(youth == "Prefer not to say", na.rm = TRUE))|>
mutate(is_unsure = ifelse(x >= 1, TRUE, FALSE)) |>
filter(`is_unsure`== "TRUE")head(unsure_counts)# A tibble: 6 × 3
names x is_unsure
<chr> <dbl> <lgl>
1 QS1_2_PROV 6 TRUE
2 QS1_4_INDIGENOUS 101 TRUE
3 QS1_12_DISABIL 133 TRUE
4 QS1_13_DISABIL 43 TRUE
5 QS1_22_HIGHESTEDU 110 TRUE
6 QS2_1_MEANINGFULP 254 TRUE
head(pns_counts)# A tibble: 6 × 3
names x is_unsure
<chr> <dbl> <lgl>
1 QS1_2_PROV 14 TRUE
2 QS1_4_INDIGENOUS 75 TRUE
3 QS1_11_SEXUALO 91 TRUE
4 QS1_12_DISABIL 40 TRUE
5 QS1_13_DISABIL 11 TRUE
6 QS2_1_MEANINGFULP 29 TRUE
- Career Planning Variables contain a huge number of ‘Unsure’ responses:
It would be more appropriate to return to this variable later and remove these observations.
To work with the rest of the data, it may serve the analysis better if
QS4_3_CAREERPLANNIN1_1_1toQS4_3_CAREERPLANNIN1_8_8will be removed.
QS4_3_CAREERPLANNIN1_1_1: 486
UnsureresponsesQS4_3_CAREERPLANNIN1_2_2: 413
QS4_3_CAREERPLANNIN1_3_3: 508
QS4_3_CAREERPLANNIN1_4_4: 463
QS4_3_CAREERPLANNIN1_5_5: 591
QS4_3_CAREERPLANNIN1_6_6: 354
QS4_3_CAREERPLANNIN1_7_7: 512
QS4_3_CAREERPLANNIN1_8_8: 717
- Other variables containing
`Unsure` responses:
- Given that the number of
`Unsure` responses in these variables is small relative to the total number of observations (~2,900), their corresponding observations will be removed. The resulting dimensions of the train data will then be examined.
6.2 Identify continuous metrics columns:
QS1_14_DISABIL (9b: At what age did disability / conditions emerge.)
QS1_15_DISABIL (9b1: How many years have the disability affected performance)
Education levels (years) being treated as continuous due to > 7 levels.
QS1_18_PARENTEDUC1
QS1_18_PARENTEDUC2
QS1_20_HIGHSCHOOL
QS1_23_YEARCOMPLE (In what year completed higher education)
Employment wage / salary:
QS1_29_EMPLOYMENT (removed)
QS1_30_EMPLOYMENT1 (removed)
QS1_30_EMPLOYMENT2 (removed)
QS1_31_EMPLOYMENT (removed)
QS1_32_WEEKLY (removed)
Month_income
Semimonth_income
Biweek_income
Week_income
QS1_28_EMPLOYMENT_calculated
Volunteer hours:
QS4_15_TIMEIFFOR1 (Total hour)
QS4_15_TIMEIFFOR2 (Hours per month)
QS4_15_TIMEIFFOR3 (Hours per week)
QS4_15_TIMEIFFOR4 (Hours per day)
Number of mentors:
- QS2_10_NUMBEROFME
# metric variables, not including outcome `QS1_25_EMPLOYMENT`
metric_vars <- c("QS1_1_AGE", "QS1_8_NEWCOMERYEAR", "QS1_14_DISABIL", "QS1_15_DISABIL", "QS1_20_HIGHSCHOOL", "QS1_23_YEARCOMPLE", "QS2_10_NUMBEROFME", "QS4_15_TIMEIFFOR1", "QS4_15_TIMEIFFOR2", "QS4_15_TIMEIFFOR3", "QS4_15_TIMEIFFOR4", "QS4_12_TRUST1_1_1", "QS4_12_TRUST1_2_2", "QS4_12_TRUST1_3_3", "QS4_12_TRUST1_4_4", "QS4_12_TRUST1_5_5", "QS4_14_FORMALVOL") 6.3 Convert all categorical columns into factor data type
- Identify categorical columns:
cate_vars <- c("QS1_2_PROV", "QS1_3_COMMUNITYTYPE", "QS1_4_INDIGENOUS",
"QS1_5_INDIGENOUSHS",
"QS1_7_NEWCOMER", "QS1_10_TRANSUM", "QS1_11_SEXUALO", "QS1_12_DISABIL", "QS1_13_DISABIL",
"QS1_17_INCARE",
"QS1_19_HIGHSCHOOL", "QS1_21_FURTHEDUCA", "QS1_25_EMPLOYMENT",
"QS1_27_PLANNINGRE",
"QS1_28_EMPLOYMENT", "QS2_1_MEANINGFULP", "QS2_2_MEANINGFULP", "QS2_3_PRESENCEOFM", "QS2_4_MENTOR61FOR",
"QS2_5_MENTOR611PR",
"QS2_7_MENTOR611SE", "QS2_8_UNMETNEED61", "QS2_9_PRESENCEOFA", "QS2_11_MENTOR1218", "QS2_12_UNMETNEED1", "QS2_16_FORMAT_1", "QS2_17_TYPE_1", "QS2_18_LOCATION_1",
"QS2_22_GEOLOCATI1",
"QS2_16_FORMAT_2", "QS2_17_TYPE_2", "QS2_18_LOCATION_2",
"QS2_22_GEOLOCATI2",
"QS2_16_FORMAT_3", "QS2_17_TYPE_3", "QS2_18_LOCATION_3",
"QS2_22_GEOLOCATI3",
"QS2_23_MOSTMEANI", "QS2_24_MENTORAGE",
# "QS2_26_INITIATIONEV", # *Removed due to having 99.7% NA entries.
# "QS2_27_MENTORPROGRA1", # Removed because it asks for 'recall of the name of mentor organization'
"QS2_28_MATCHCHOICE", "Match_GenderIdentity", "Match_Ethnicity", "Match_CulturalBackground", "Match_ScheduleAvailability", "Match_Interests", "Match_Goals", "Match_Personalities", "Match_LifeStruggles", "Transition_School", "Transition_NewSchool", "Transition_NewCommunity", "Transition_GettingDriversLicense", "Transition_JobAspirations", "Transition_GettingFirstJob", "Transition_ApplyingToTradeSchool-Collge-Uni", "Transition_IndependenceFromGuardian", "Transition_FundingForTradeSchool-Collge-Uni", "Transition_NoneOfAbove", "QS2_33_TRANSITIONS1_13_13", "QS2_33_TRANSITIONS1_14_14", "QS3_4_LIFEEVENTS1_1_1", "QS3_4_LIFEEVENTS1_2_2", "QS3_4_LIFEEVENTS1_3_3", "QS3_4_LIFEEVENTS1_4_4", "QS3_4_LIFEEVENTS1_5_5", "QS3_4_LIFEEVENTS1_6_6", "QS3_4_LIFEEVENTS1_7_7", "QS3_4_LIFEEVENTS1_8_8", "QS3_4_LIFEEVENTS1_9_9", "QS3_4_LIFEEVENTS1_10_10", "QS3_4_LIFEEVENTS1_11_11", "QS3_4_LIFEEVENTS1_12_12", "QS3_4_LIFEEVENTS1_13_13", "QS3_4_LIFEEVENTS1_14_14", "QS3_4_LIFEEVENTS1_15_15", "QS3_4_LIFEEVENTS1_16_16", "QS3_4_LIFEEVENTS1_17_17", "QS3_4_LIFEEVENTS1_18_18", "QS3_4_LIFEEVENTS1_19_19", "QS3_4_LIFEEVENTS1_20_20", "QS4_1_MEANINGFULPERSON", "QS4_4_EDUCATIONALEXPEC", "QS4_6_DISAPPOINTED", "QS4_13_LIFEEVE1_1_1", "QS4_13_LIFEEVE1_2_2", "QS4_13_LIFEEVE1_3_3", "QS4_13_LIFEEVE1_4_4", "QS4_13_LIFEEVE1_5_5", "QS4_13_LIFEEVE1_6_6", "QS4_16_FORMALVOL", "QS4_17_SERVEDASM", "QS4_18_CURRENTOR",
"QS4_21_MENTORING", "QS4_22_PASTMENTO", "QS4_25_FUTUREMEN",
"QS4_26_INTERNETC", "QS4_27_INTERNETC1_1_1", "QS4_27_INTERNETC1_2_2", "QS4_27_INTERNETC1_3_3", "QS4_27_INTERNETC1_4_4", "QS4_28_INTERNETCON", "QS4_29_PRIVATECONN", "QS4_30_INTERNETCON", "QS4_31_MOBILECONNE", "QS4_32_MOBILECONNE1_1_1", "QS4_32_MOBILECONNE1_2_2", "QS4_32_MOBILECONNE1_3_3", "QS4_32_MOBILECONNE1_4_4", "QS4_33_MOBILECONNECT"
)youth <- youth |>
mutate(across(all_of(cate_vars), as_factor))6.4 Identify Likert scale items columns (ordinal)
ordinal_numeric_vars <- c("QS1_18_PARENTEDUC1", "QS1_18_PARENTEDUC2", "QS2_19_DURATION_1", "QS2_20_EXPERIENCE_1", "QS2_19_DURATION_2", "QS2_20_EXPERIENCE_2", "QS2_19_DURATION_3", "QS2_20_EXPERIENCE_3", "QS2_30_MATCHSIMILAR1_1_1", "QS2_30_MATCHSIMILAR1_2_2", "QS2_30_MATCHSIMILAR1_3_3", "QS2_30_MATCHSIMILAR1_4_4", "QS2_30_MATCHSIMILAR1_5_5", "QS2_31_MENTORINGREL1_1_1", "QS2_31_MENTORINGREL1_2_2", "QS2_31_MENTORINGREL1_3_3", "QS2_31_MENTORINGREL1_4_4", "QS2_31_MENTORINGREL1_5_5", "QS2_32_MENTORINGENG1_1_1", "QS2_32_MENTORINGENG1_2_2", "QS2_32_MENTORINGENG1_3_3", "QS2_32_MENTORINGENG1_4_4", "QS2_32_MENTORINGENG1_5_5", "QS2_32_MENTORINGENG1_6_6", "QS2_32_MENTORINGENG1_7_7", "QS2_32_MENTORINGENG1_8_8", "QS2_32_MENTORINGENG1_9_9", "QS2_32_MENTORINGENG1_10_10", "QS2_32_MENTORINGENG1_11_11", "QS2_32_MENTORINGENG1_12_12", "QS2_32_MENTORINGENG1_13_13", "QS2_32_MENTORINGENG1_14_14", "QS2_32_MENTORINGENG1_15_15", "QS2_32_MENTORINGENG1_16_16", "QS2_32_MENTORINGENG1_17_17", "QS2_32_MENTORINGENG1_18_18", "QS2_32_MENTORINGENG1_19_19", "QS2_32_MENTORINGENG1_20_20", "QS2_32_MENTORINGENG1_21_21", "QS2_32_MENTORINGENG1_22_22", "QS2_35_SUPPORTSIMPO1_1_1", "QS2_35_SUPPORTSIMPO1_2_2", "QS2_35_SUPPORTSIMPO1_3_3", "QS2_35_SUPPORTSIMPO1_4_4", "QS2_35_SUPPORTSIMPO1_5_5", "QS2_35_SUPPORTSIMPO1_6_6", "QS2_35_SUPPORTSIMPO1_7_7", "QS2_35_SUPPORTSIMPO1_8_8", "QS2_35_SUPPORTSIMPO1_9_9", "QS2_35_SUPPORTSIMPO1_10_10", "QS2_37_HELPFULNESS",
# "QS_40_REMATCHING_2", "QS_40_REMATCHING_3",
"QS3_1_GLOBALSELFWOR1_1_1", "QS3_1_GLOBALSELFWOR1_2_2", "QS3_1_GLOBALSELFWOR1_3_3", "QS3_1_GLOBALSELFWOR1_4_4", "QS3_1_GLOBALSELFWOR1_5_5", "QS3_1_GLOBALSELFWOR1_6_6", "QS3_1_GLOBALSELFWOR1_7_7", "QS3_1_GLOBALSELFWOR1_8_8", "QS3_5_SCHOOLCLIMATE1_1_1", "QS3_5_SCHOOLCLIMATE1_2_2", "QS3_5_SCHOOLCLIMATE1_3_3", "QS3_5_SCHOOLCLIMATE1_4_4", "QS3_5_SCHOOLCLIMATE1_5_5", "QS3_5_SCHOOLCLIMATE1_6_6", "QS3_5_SCHOOLCLIMATE1_7_7", "QS3_5_SCHOOLCLIMATE1_8_8", "QS3_5_SCHOOLCLIMATE1_9_9", "QS3_5_SCHOOLCLIMATE1_10_10", "QS4_2_MEANINGFULPERSON", "QS4_8_HELPSEEKING1_1_1", "QS4_8_HELPSEEKING1_2_2", "QS4_8_HELPSEEKING1_3_3", "QS4_8_HELPSEEKING1_4_4", "QS4_8_HELPSEEKING1_5_5", "QS4_8_HELPSEEKING1_6_6", "QS4_8_HELPSEEKING1_7_7", "QS4_8_HELPSEEKING1_8_8", "QS4_8_HELPSEEKING1_9_9", "QS4_8_HELPSEEKING1_10_10", "QS4_10_MENTALWELLBE1_1_1", "QS4_10_MENTALWELLBE1_2_2", "QS4_10_MENTALWELLBE1_3_3", "QS4_10_MENTALWELLBE1_4_4", "QS4_10_MENTALWELLBE1_5_5", "QS4_10_MENTALWELLBE1_6_6", "QS4_10_MENTALWELLBE1_7_7"
)
ordinal_chr_vars <- c("QS1_22_HIGHESTEDU", "QS2_6_MENTOREXPER", "QS2_34_SUPPORTS1_1_1", "QS2_34_SUPPORTS1_2_2", "QS2_34_SUPPORTS1_3_3", "QS2_34_SUPPORTS1_4_4", "QS2_34_SUPPORTS1_5_5", "QS2_34_SUPPORTS1_6_6", "QS2_34_SUPPORTS1_7_7", "QS2_34_SUPPORTS1_8_8", "QS2_34_SUPPORTS1_9_9", "QS2_36_INFLUENCE1_1_1", "QS2_36_INFLUENCE1_2_2", "QS2_36_INFLUENCE1_3_3", "QS2_36_INFLUENCE1_4_4", "QS2_36_INFLUENCE1_5_5", "QS2_36_INFLUENCE1_6_6", "QS2_36_INFLUENCE1_7_7", "QS2_36_INFLUENCE1_8_8", "QS2_36_INFLUENCE1_9_9", "QS4_3_CAREERPLANNIN1_1_1", "QS4_3_CAREERPLANNIN1_2_2", "QS4_3_CAREERPLANNIN1_3_3", "QS4_3_CAREERPLANNIN1_4_4", "QS4_3_CAREERPLANNIN1_5_5", "QS4_3_CAREERPLANNIN1_6_6", "QS4_3_CAREERPLANNIN1_7_7", "QS4_3_CAREERPLANNIN1_8_8", "QS4_5_SATEDU", "QS4_7_SOCIALCAPITAL1_1_1", "QS4_7_SOCIALCAPITAL1_2_2", "QS4_7_SOCIALCAPITAL1_3_3", "QS4_7_SOCIALCAPITAL1_4_4","QS4_9_MENTALHEALTH", "QS4_11_BELONGING","QS4_20_MENTEEAGE", "QS4_24_FUTUREMEN")6.4.1 Handelling Varied Likert Scale Items: Label Alignment Issue
6.4.1.1 Task: Reverse coding for raw ordinal columns with numeric encoding:
For some ordinal columns (the numerically encoded ones), the valence direction is inconsistent. The task is to ensure that higher numeric values consistently represent the ‘positive’ or ‘high’ end of the scale (e.g., agreement, positivity, truth) across all variables:
Question 11:
QS1_18_PARENTEDUC*Question 20f:
QS2_19_DURATION_*- <3 months, 3-6 months, 7-11 months, 12 months or more, It’s still ongoing
Question 20g:
QS2_20_EXPERIENCE_* - Always positive; Mostly positive; Neutral; Mostly negative; Always negative
Question 26:
QS2_30_MATCHSIMILAR1_*_*- (1) Disagree (2) Neither agree nor disagree (3) Agree
Question 27:
QS2_31_MENTORINGREL1_*_*- 1. Strongly disagree 2. Disagree 3. Neither agree nor disagree 4. Agree 5. Strongly agree
Question 28:
QS2_32_MENTORINGENG1_*_*- 1.Not very true—2.Sometimes true—3.Very true
Question 30:
QS2_35_SUPPORTSIMPO1_6_6- Not very true—Sometimes true—Very true
Question 36:
QS3_1_GLOBALSELFWOR1_2_2- (1) Strongly disagree (2) Disagree (3) Agree (4) Strongly agree
Question 39:
QS3_5_SCHOOLCLIMATE1_*_*- Strongly Agree——- A——-D——–Strongly Disagree
Question44:
QS4_8_HELPSEEKING1_*_*- (1) Not at all true (2) (3) (4) (5) Completely true
Question 46
QS4_10_MENTALWELLBE1_*_*:- (1) None of the time — (2) Rarely — (3) Some of the time — (4) Often — (5) All of the time
Steps: QS2_20_EXPERIENCE_* (the Question 20g experience items; 1-5 scale: Always positive to Always negative) and QS3_5_SCHOOLCLIMATE1_* (1-4 scale: Strongly Agree to Strongly Disagree) require reverse coding.
# Preview the reverse coding in separate `reversed_*` columns so the original
# and reversed values can be compared side by side before `youth` is changed.
#
# The experience items are named QS2_20_EXPERIENCE_* in this data set (see
# `ordinal_numeric_vars`); the previous "^QS2_19_EXPERIENCE_" selector matched
# no columns, so those items were silently left un-reversed.
reversed_coding <- youth |>
  mutate(
    # Reverse QS2_20_EXPERIENCE_* (1-5 scale: Always positive to Always negative).
    # NOTE(review): assumes only codes 1-5 occur in these columns -- confirm.
    across(
      matches("^QS2_20_EXPERIENCE_"),
      ~ 6 - .x,
      .names = "reversed_{.col}"
    ),
    # Reverse QS3_5_SCHOOLCLIMATE1_*_* (1-4 scale: Strongly Agree to Strongly Disagree)
    across(
      matches("^QS3_5_SCHOOLCLIMATE1_"),
      ~ 5 - .x,
      .names = "reversed_{.col}"
    )
  )
# Validate reversed coding:
reversed_coding |>
  select(
    starts_with("QS2_20_EXPERIENCE_"),
    starts_with("reversed_QS2_20_EXPERIENCE_"),
    starts_with("QS3_5_SCHOOLCLIMATE1_"),
    starts_with("reversed_QS3_5_SCHOOLCLIMATE1_")
  ) |>
  head() # A tibble: 6 × 20
QS3_5_SCHOOLCLIMATE1_1_1 QS3_5_SCHOOLCLIMATE1_2_2 QS3_5_SCHOOLCLIMATE1_3_3
<dbl> <dbl> <dbl>
1 4 4 4
2 1 2 2
3 1 1 1
4 3 2 2
5 3 3 3
6 2 2 3
# ℹ 17 more variables: QS3_5_SCHOOLCLIMATE1_4_4 <dbl>,
# QS3_5_SCHOOLCLIMATE1_5_5 <dbl>, QS3_5_SCHOOLCLIMATE1_6_6 <dbl>,
# QS3_5_SCHOOLCLIMATE1_7_7 <dbl>, QS3_5_SCHOOLCLIMATE1_8_8 <dbl>,
# QS3_5_SCHOOLCLIMATE1_9_9 <dbl>, QS3_5_SCHOOLCLIMATE1_10_10 <dbl>,
# reversed_QS3_5_SCHOOLCLIMATE1_1_1 <dbl>,
# reversed_QS3_5_SCHOOLCLIMATE1_2_2 <dbl>,
# reversed_QS3_5_SCHOOLCLIMATE1_3_3 <dbl>, …
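`reversed_coding` shows that the `reversed_QS3_5_SCHOOLCLIMATE1_*` columns are successful reverse encodings of `QS3_5_SCHOOLCLIMATE1_*`. Note that no reversed experience columns appear in the printed preview above: the experience items are named `QS2_20_EXPERIENCE_*` in this data set (see `ordinal_numeric_vars`), so they must be selected under that name to be reversed. Now, let’s apply this reverse encoding to the original
`youth` data set, replacing the experience and school-climate columns directly in the original data frame: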
# Reverse-code, in place, the numerically encoded ordinal items whose scale
# runs from the positive end to the negative end, so that a higher value
# consistently means the more positive response.
#
# The experience items are named QS2_20_EXPERIENCE_* in this data set (see
# `ordinal_numeric_vars`); the previous "^QS2_19_EXPERIENCE_" selector matched
# no columns, so those items were silently left un-reversed.
youth <- youth |>
  mutate(
    # Reverse QS2_20_EXPERIENCE_* (1-5 scale: Always positive to Always negative).
    # NOTE(review): assumes only codes 1-5 occur in these columns -- confirm.
    across(matches("^QS2_20_EXPERIENCE_"), ~ 6 - .x),
    # Reverse QS3_5_SCHOOLCLIMATE1_*_* (1-4 scale: Strongly Agree to Strongly Disagree)
    across(matches("^QS3_5_SCHOOLCLIMATE1_"), ~ 5 - .x)
  )
# Once again, validate result:
youth |>
  select(
    starts_with("QS2_20_EXPERIENCE_"),
    starts_with("QS3_5_SCHOOLCLIMATE1_")
  ) |>
  head() # A tibble: 6 × 10
QS3_5_SCHOOLCLIMATE1_1_1 QS3_5_SCHOOLCLIMATE1_2_2 QS3_5_SCHOOLCLIMATE1_3_3
<dbl> <dbl> <dbl>
1 1 1 1
2 4 3 3
3 4 4 4
4 2 3 3
5 2 2 2
6 3 3 2
# ℹ 7 more variables: QS3_5_SCHOOLCLIMATE1_4_4 <dbl>,
# QS3_5_SCHOOLCLIMATE1_5_5 <dbl>, QS3_5_SCHOOLCLIMATE1_6_6 <dbl>,
# QS3_5_SCHOOLCLIMATE1_7_7 <dbl>, QS3_5_SCHOOLCLIMATE1_8_8 <dbl>,
# QS3_5_SCHOOLCLIMATE1_9_9 <dbl>, QS3_5_SCHOOLCLIMATE1_10_10 <dbl>
6.4.2 Convert all string/character Likert scale items columns into ordinal data type:
youth <- youth |>
mutate(across(all_of(ordinal_numeric_vars), as_factor)) # converts ordinal items encoded in either (1) numbers or (2) strings into factor.
youth <- youth |>
mutate(across(all_of(ordinal_chr_vars), as_factor)) 6.4.3 Convert string/character Likert scale level columns into ordered numeric:
- Specify string/character-labeled Likert levels:
experience_ord <- list(levels = 1:5, labels = c("Always negative", "Mostly negative", "Somewhat positive", "Mostly positive", "Always positive"))
support_ord <- list(levels = 1:3, labels = c("Not very true","Sometimes true","Very true"))
influence_ord <- list(levels = 1:4, labels = c("A little","Some","Quite a bit","A lot")) # No 'None' in response range
career_plan_ord <- list(levels = 1:7, labels = c("Completely disagree","Disagree","Somewhat disagree","Unsure","Somewhat agree","Agree","Completely agree"))
social_capital_ord <- list(levels = 1:5, labels = c("Strongly disagree","Disagree","Neutral","Agree","Strongly Agree"))
mental_health_ord <- list(levels = 1:5, labels = c("Poor","Fair","Good","Very good", "Excellent"))
# "Don't know" / 'Prefer not to answer' is not included (previously filtered)
belonging_ord <- list(levels = 1:4, labels = c("Very weak","Somewhat weak","Somewhat strong","Very strong"))
mentee_age_ord <- list(levels = 1:4, labels = c("Under 18 years old","18-25 years old","26-29 years old","30 years old or older"))
mentoring_interest_ord <- list(levels = 1:4, labels = c("Not interested at all", "Not that interested","Fairly interested","Very interested"))
high_edu_ord <- list(levels = 1:11, labels = c("Certificate of Apprenticeship or Certificate of Qualification",
"Other trades certificate or diploma",
"College, CEGEP or other non-university certificate or diploma",
"University diploma or certificate BELOW Bachelor’s Degree",
"Bachelor’s degree (e.g. BA, BSc, B.Ed., B.Eng including LL.B-law degree)",
"University diploma or certificate ABOVE Bachelor’s Degree",
"First professional degree (e.g. medicine, dentistry, veterinary medicine, optometry)",
"Master's degree (e.g. M.B.A., M.A., M.Sc.)",
"Doctorate degree (earned e.g. Ph.D, D.Sc., D.Ed) or post-doctoral program",
"Other, please specify",
"Unsure"))- Compile all character-labeled Likert items in a list (i.e., a list of list in this case).
lst_of_ord_cols <- list(QS2_6_MENTOREXPER = experience_ord,
QS2_34_SUPPORTS1_1_1 = support_ord,
QS2_34_SUPPORTS1_2_2 = support_ord,
QS2_34_SUPPORTS1_3_3 = support_ord,
QS2_34_SUPPORTS1_4_4 = support_ord,
QS2_34_SUPPORTS1_5_5 = support_ord,
QS2_34_SUPPORTS1_6_6 = support_ord,
QS2_34_SUPPORTS1_7_7 = support_ord,
QS2_34_SUPPORTS1_8_8 = support_ord,
QS2_34_SUPPORTS1_9_9 = support_ord,
QS2_36_INFLUENCE1_1_1 = influence_ord,
QS2_36_INFLUENCE1_2_2 = influence_ord,
QS2_36_INFLUENCE1_3_3 = influence_ord,
QS2_36_INFLUENCE1_4_4 = influence_ord,
QS2_36_INFLUENCE1_5_5 = influence_ord,
QS2_36_INFLUENCE1_6_6 = influence_ord,
QS2_36_INFLUENCE1_7_7 = influence_ord,
QS2_36_INFLUENCE1_8_8 = influence_ord,
QS2_36_INFLUENCE1_9_9 = influence_ord,
QS4_3_CAREERPLANNIN1_1_1 = career_plan_ord,
QS4_3_CAREERPLANNIN1_2_2 = career_plan_ord,
QS4_3_CAREERPLANNIN1_3_3 = career_plan_ord,
QS4_3_CAREERPLANNIN1_4_4 = career_plan_ord,
QS4_3_CAREERPLANNIN1_5_5 = career_plan_ord,
QS4_3_CAREERPLANNIN1_6_6 = career_plan_ord,
QS4_3_CAREERPLANNIN1_7_7 = career_plan_ord,
QS4_3_CAREERPLANNIN1_8_8 = career_plan_ord,
QS4_7_SOCIALCAPITAL1_1_1 = social_capital_ord,
QS4_7_SOCIALCAPITAL1_2_2 = social_capital_ord,
QS4_7_SOCIALCAPITAL1_3_3 = social_capital_ord,
QS4_7_SOCIALCAPITAL1_4_4 = social_capital_ord,
QS4_9_MENTALHEALTH = mental_health_ord,
QS4_11_BELONGING = belonging_ord,
QS4_20_MENTEEAGE = mentee_age_ord,
QS4_24_FUTUREMEN = mentoring_interest_ord,
QS1_22_HIGHESTEDU = high_edu_ord,
QS4_5_SATEDU = high_edu_ord)- Write a function to convert all character labels to the correct ordinal numeric scores!
6.4.3.0.1 Function 1: For converting string type ordinal leveled columns into well-ordered integers:
# Recode string-labelled Likert columns as ordered factors whose level names
# are the numeric scores of the scale.
#
# Arguments:
#   data               Data frame (or tibble) holding the columns to recode.
#   lst_ord_level_cols Named list keyed by column name. Each element is a list
#                      with `labels` (the response strings, lowest to highest)
#                      and `levels` (the numeric score for each label).
#   lst_col_names      Character vector of the columns to recode.
#
# Returns `data` with each listed column replaced by an ordered factor.
# Responses that are not among the scale's `labels` become NA (standard
# factor() behaviour).
#
# Stops with an error when a requested column is absent from `data` or has no
# entry in `lst_ord_level_cols`; the latter case previously yielded a silent
# all-NA factor with zero levels.
convert_string_ord_fac <- function(data, lst_ord_level_cols, lst_col_names) {
  absent_cols <- setdiff(lst_col_names, names(data))
  if (length(absent_cols) > 0) {
    stop(
      "Columns not found in `data`: ",
      paste(absent_cols, collapse = ", "),
      call. = FALSE
    )
  }

  unscaled_cols <- setdiff(lst_col_names, names(lst_ord_level_cols))
  if (length(unscaled_cols) > 0) {
    stop(
      "No ordinal scale defined for: ",
      paste(unscaled_cols, collapse = ", "),
      call. = FALSE
    )
  }

  for (col_name in lst_col_names) {
    scale <- lst_ord_level_cols[[col_name]]
    data[[col_name]] <- factor(
      data[[col_name]],
      levels = scale$labels, # the string labels are matched as the 'levels'
      labels = scale$levels, # ...and renamed to their numeric scores
      ordered = TRUE
    )
  }

  data
}
# Apply the above function, convert_string_ord_fac():
youth |> select(all_of(c(ordinal_chr_vars))) |> str() # before conversion
youth <- convert_string_ord_fac(youth, lst_of_ord_cols, ordinal_chr_vars)
# youth |> select(all_of(c(ordinal_chr_vars))) |> str() # after conversion
# Validated! Success!6.4.3.0.2 Function 2: For converting numeric type ordinal leveled columns into ordered factor
# Turn numerically encoded ordinal columns into ordered factors.
#
# Arguments:
#   data          Data frame (or tibble) holding the columns to convert.
#   lst_col_names Character vector of the columns to convert.
#
# Returns `data` with each listed column replaced by an ordered factor whose
# levels are the sorted distinct values observed in that column.
convert_number_ord_fac <- function(data, lst_col_names) {
  as_observed_ordered <- function(values) {
    observed_levels <- sort(unique(values))
    factor(values, levels = observed_levels, ordered = TRUE)
  }

  selected_cols <- data[lst_col_names]
  data[lst_col_names] <- lapply(selected_cols, as_observed_ordered)
  data
}
# Apply above Function 2 to ordinal_numeric_vars
youth |> select(all_of(c(ordinal_numeric_vars))) |> str() # before conversion
youth <- convert_number_ord_fac(youth, ordinal_numeric_vars)
# youth |> select(all_of(c(ordinal_numeric_vars))) |> str() # after conversion
# Validated! Another success!6.4.4 Unsure response entries in ordinal columns:
6.4.4.1 QS4_3_CAREERPLANNIN1_1_1 to QS4_3_CAREERPLANNIN1_8_8: The only ordinal column with Unsure entries
- The following table examines number of
Unsureentries here: 40 “Unsure” responses:
# Summary number of `Unsure` responses for QS4_3_CAREERPLANNIN:
bind_cols(list(count_career_1, count_career_2, count_career_3, count_career_4, count_career_5, count_career_6, count_career_7, count_career_8))# A tibble: 1 × 8
count_career_1 count_career_2 count_career_3 count_career_4 count_career_5
<int> <int> <int> <int> <int>
1 342 308 380 343 442
# ℹ 3 more variables: count_career_6 <int>, count_career_7 <int>,
# count_career_8 <int>
Decision: Because `Unsure` in the context of this question acts as an ‘undecided’ midpoint between the two ends of the Likert scale, we retain `Unsure` for this ordinal item.
6.4.5 Unsure and Other, please specify entries in QS4_4_EDUCATIONALEXPEC
Other, please specify= 10Unsure= 11
cat_unsure_pnts_10 <- c("QS1_22_HIGHESTEDU", "QS4_4_EDUCATIONALEXPEC") # 10 = Other, 11 = Unsure
youth |>
select(QS1_22_HIGHESTEDU, QS4_4_EDUCATIONALEXPEC) |>
mutate(across(everything(), as.numeric)) |>
filter(QS1_22_HIGHESTEDU > 10) |>
summarize(count_high_edu_na = n()) # A tibble: 1 × 1
count_high_edu_na
<int>
1 110
A total of 110 respondents answered
`Unsure` (code 11) for question 13a (QS1_22_HIGHESTEDU). Note that the `> 10` filter above excludes `Other, please specify` (code 10); use `>= 10` to count both responses.
youth |>
select(QS1_22_HIGHESTEDU, QS4_4_EDUCATIONALEXPEC) |>
mutate(across(everything(), as.numeric)) |>
filter(QS4_4_EDUCATIONALEXPEC >= 10) |>
summarize(count_high_edu_na = n())# A tibble: 1 × 1
count_high_edu_na
<int>
1 91
A total of 91 respondents who had answered either
UnsureorOther, please specifyfor question 42 (QS4_4_EDUCATIONALEXPEC).Decision: We choose retain
Unsurefor this ordinal item.
6.4.6 Remove *_PreferNotToSay, *_Unsure, or *_Other OHE’d categorical variables directly from raw data:
Downstream Plan 1: These
*_PreferNotToSay,*_Unsure, and*_Otherending variables are already in the raw dataframe. The subsequent one-hot encoding we’ll perform on other nominal variables that had not been properly OHE’d will go through the same removal once again.Downstream Plan 2: There are binary (
YesorNo) responses to some of the categorical columns as well. After the downstream OHE, theNocolumn corresponding to those binary columns will be removed.
youth |>
select((ends_with(c("_PreferNotToSay", "_Unsure", "_Other")))) |>
ncol() [1] 21
# Removal:
youth <- youth |>
select(-(ends_with(c("_PreferNotToSay", "_Unsure", "_Other"))))There are 21 columns of
*_PreferNotToSay,*_Unsure, or*_Othercategorical variables, and these columns have been removed.
7 Data train-test split
split <- initial_split(youth, prop = .80, strata = QS1_28_EMPLOYMENT_calculated)
train_data <- training(split)
test_data <- testing(split)7.0.1 Impute missing (NA) entries in ordinal columns:
- Imputation should be done after train/test data splitting to prevent test data leakage into the train set!
preprocessing_recipe <- recipes::recipe(QS1_28_EMPLOYMENT_calculated ~ ., data = train_data) |>
# step_impute_knn(all_of(c(all_nominal_predictors(),
# all_numeric_predictors())),
# -all_of(metric_vars), neighbors = 5) |> # KNN Imputation is too computational time consuming for this large dataset without special handling-- we pass it just for now.
step_impute_mode(all_ordered_predictors()) |>
step_mutate_at( all_ordered_predictors(), fn = as.numeric) |>
step_impute_mode(all_nominal_predictors()) |>
# cate_vars are those vars that were categorical but not ohe.
step_dummy(any_of(all_nominal_predictors()), one_hot = TRUE) |>
# Standardize continuous variables:
step_center(all_of(metric_vars)) |>
step_scale(all_of(metric_vars)) |>
# Mean impute continuous variables:
step_impute_mean(all_of(metric_vars)) |>
step_impute_median(all_numeric_predictors(), -all_of(metric_vars)) |>
# Impute income output with its median
step_impute_median(all_outcomes()) Comment:
- Code viewers can run the following code snippet in the console to see concise count of each ordinal level! :
mean_mode_imputed_train |> select(all_of(c(ordinal_numeric_vars, ordinal_chr_vars))) |> summary() mean_mode_imputed_train |> select(all_of(ordinal_chr_vars)) |> summary()
- Continuous variables are standardized.
mean_mode_impute_prep <- preprocessing_recipe |>
prep()
mean_mode_imputed_train <- bake(mean_mode_impute_prep, new_data = NULL)
mean_mode_imputed_test <- bake(mean_mode_impute_prep, new_data = test_data)7.0.2 Only one-hot encode the nominal columns without imputation:
- Continuous variables are standardized.
preprocessing_recipe_ohe_unimputed <- recipes::recipe(QS1_28_EMPLOYMENT_calculated ~ ., data = train_data) |>
step_dummy(all_of(cate_vars), one_hot = TRUE) |>
step_center(all_of(metric_vars)) |>
step_scale(all_of(metric_vars))ohe_unimpute_prep <- preprocessing_recipe_ohe_unimputed |>
prep()
ohe_unimputed_train <- bake(ohe_unimpute_prep, new_data = NULL)
ohe_unimputed_test <- bake(ohe_unimpute_prep, new_data = test_data)Validate post-ohe, column names are the same between mean_mode_imputed_train and
ohe_unimputed_train:
identical(ncol(ohe_unimputed_train), ncol(mean_mode_imputed_train))# Outputs TRUE! They are the same![1] TRUE
identical(nrow(ohe_unimputed_train), nrow(mean_mode_imputed_train))# Outputs TRUE! They are the same![1] TRUE
nrow(ohe_unimputed_train)[1] 1915
nrow(mean_mode_imputed_train)[1] 1915
nrow(youth)[1] 2396
ncol(mean_mode_imputed_train)[1] 699
ncol(ohe_unimputed_train)[1] 699
ncol(youth)[1] 401
7.1 Examine new one-hot encoded columns:
Task 1: Let’s look at how many new one-hot encoded columns contain *_PreferNotToSay, *_Unsure, and *_Other endings, which need to be removed.
Task 2: Also look at how many new ohe’d columns contain the ending of: *_No and numeric encoding for *_PreferNotToSay, *_Unsure, and *_Other.
7.1.0.1 Remove *_PreferNotToSay, *_Prefer.not.to.answer, *_Unsure, and *_Other:
mean_mode_imputed_train |>
select((ends_with(c("_PreferNotToSay", "_Prefer.not.to.say", "_Prefer.not.to.answer", "_Unsure", "_Other")))) |>
ncol()[1] 43
ohe_unimputed_train |>
select((ends_with(c("_PreferNotToSay", "_Prefer.not.to.say", "_Prefer.not.to.answer", "_Unsure", "_Other")))) |>
ncol()[1] 43
Both the imputed and unimputed OHE’d train datasets output 43, meaning there are 43 columns ending in “_PreferNotToSay”, “_Prefer.not.to.say”, “_Prefer.not.to.answer”, “_Unsure”, or “_Other”.
# Removals (train):
mean_mode_imputed_train <- mean_mode_imputed_train |>
select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))
ohe_unimputed_train <- ohe_unimputed_train |>
select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))# Removals (test):
mean_mode_imputed_test <- mean_mode_imputed_test |>
select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))
ohe_unimputed_test <- ohe_unimputed_test |>
select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))7.1.0.2 Remove *_No
mean_mode_imputed_train |>
select((contains(c("_No")))) |>
ncol() [1] 47
mean_mode_imputed_train |>
select((ends_with(c("_No")))) |>
ncol() [1] 38
ohe_unimputed_train |>
select((ends_with(c("_No")))) |>
ncol() [1] 38
Both imputed and unimputed OHE’d train datasets output 38, meaning there are 38 columns of “*_No”.
# Removals (train):
# Drop only the dummy columns whose names END in "_No" (the 38 columns counted
# above). The previous contains("_No") selector matched 47 columns, i.e. it
# also removed 9 columns that merely contain "_No" somewhere in their name.
mean_mode_imputed_train <- mean_mode_imputed_train |>
  select(-ends_with("_No"))
ohe_unimputed_train <- ohe_unimputed_train |>
  select(-ends_with("_No"))
# Removals (test):
mean_mode_imputed_test <- mean_mode_imputed_test |>
  select(-ends_with("_No"))
ohe_unimputed_test <- ohe_unimputed_test |>
  select(-ends_with("_No"))
# 7.1.0.3 Remove numerically encoded *_PreferNotToSay, *_Unsure, *_Other, and *_No:
7.1.0.4
_PreferNotToSay: For convenience, we sometimes use ‘pnts’ short for Prefer Not to Say.
# Base variable names grouped by the numeric level at which the
# No / Unsure / Prefer-not-to-say ("pnts") codes start. The one-hot columns
# derived from them are named <variable>_X<level>.

# Levels >= 2 are no/unsure/pnts.
cat_unsure_pnts_2_or_more <- c(
  "QS1_5_INDIGENOUSHS", "QS1_17_INCARE", "QS1_27_PLANNINGRE",
  "QS2_5_MENTOR611PR", "QS2_7_MENTOR611SE", "QS2_11_MENTOR1218",
  paste0("QS2_22_GEOLOCATI", 1:3),
  # Select-all items: QS3_4_LIFEEVENTS1_1_1 ... QS3_4_LIFEEVENTS1_20_20
  sprintf("QS3_4_LIFEEVENTS1_%d_%d", 1:20, 1:20),
  "QS4_6_DISAPPOINTED",
  # QS4_13_LIFEEVE1_1_1 ... QS4_13_LIFEEVE1_6_6
  sprintf("QS4_13_LIFEEVE1_%d_%d", 1:6, 1:6),
  "QS4_16_FORMALVOL", "QS4_26_INTERNETC",
  "QS4_28_INTERNETCON", "QS4_31_MOBILECONNE", "QS4_33_MOBILECONNECT"
)

# Levels >= 3 are no/unsure/pnts.
cat_unsure_pnts_3 <- c(
  "QS4_1_MEANINGFULPERSON", "QS4_21_MENTORING", "QS4_29_PRIVATECONN"
)

# Levels >= 4 are unsure/pnts.
cat_unsure_pnts_4 <- c(
  "QS4_22_PASTMENTO",
  paste0("QS2_17_TYPE_", 1:3),
  paste0("QS2_18_LOCATION_", 1:3),
  "QS4_30_INTERNETCON"
)

# Levels >= X11 are unsure/pnts.
cat_unsure_pnts_11 <- "QS4_4_EDUCATIONALEXPEC"

# Select-all items whose X0 level means "no".
cat_X0_no <- c(
  sprintf("QS4_27_INTERNETC1_%d_%d", 1:4, 1:4),
  sprintf("QS4_32_MOBILECONNE1_%d_%d", 1:4, 1:4)
)
# *** also need to remove QS4_30_INTERNETCON_X2 independently
Variables listed in `cat_unsure_pnts_2_or_more` encode `No`, `Unsure` and/or `Prefer not to say` at level 2 and above.
Variables listed in `cat_unsure_pnts_3` contain `Unsure` and/or `Prefer not to say` at encoding `Unsure = 3` and `Prefer not to say = 4`.
Variables listed in `cat_unsure_pnts_4` contain `Unsure` and/or `Prefer not to say` at encoding `Unsure = 4` and/or `Prefer not to say = 5`.
Variables listed in `cat_unsure_pnts_11` contain `Unsure` and/or `Prefer not to say` at encoding `Unsure = 11` and `Prefer not to say = 12`.
Internet access variables listed in `cat_internet_access` contain `Unsure` and/or `Prefer not to say` at encoding `Unsure = 3` and `Prefer not to say = 4`.
# Removal of numerically encoded No / Unsure / Prefer-not-to-say dummies
# (mean/mode-imputed train).
# Each regex alternative pairs one variable list with the level suffixes that
# encode a non-response for those variables; `$` anchors the suffix at the end
# of the column name.
mean_mode_imputed_train <- mean_mode_imputed_train |>
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    # Fix: the one-hot columns are named "<var>_X11" (and "_X12" if present),
    # which the ends_with("X_11") call below never matched, so
    # QS4_4_EDUCATIONALEXPEC_X11 survived the removal.
    "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    # Anchored so that only the X2 level itself is dropped.
    "QS4_30_INTERNETCON_X2$"
  )))
# Kept for backward compatibility: drops any column spelled with the "X_11"
# suffix, should one exist.
mean_mode_imputed_train <- mean_mode_imputed_train |>
  select(-ends_with("X_11"))
# Verify removal: list every remaining column of mean_mode_imputed_train whose
# name ends in _X0, _X2, _X3, _X4, _X5, _X11 or _X_11.
# NOTE(review): this pattern is not restricted to the variables targeted by
# the removal above, so it also reports levels that were never meant to be
# dropped (the printed list includes e.g. QS2_17_TYPE_1_X2). A non-empty
# result therefore does not by itself mean the removal failed.
remaining_cols <- colnames(mean_mode_imputed_train)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(mean_mode_imputed_train))]
if (length(remaining_cols) > 0) {
message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
print(remaining_cols)
} else {
message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
} [1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
# Removal of numerically encoded No / Unsure / Prefer-not-to-say dummies
# (unimputed one-hot-encoded train).
# Each regex alternative pairs one variable list with the level suffixes that
# encode a non-response for those variables; `$` anchors the suffix at the end
# of the column name.
ohe_unimputed_train <- ohe_unimputed_train |>
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    # Fix: the one-hot columns are named "<var>_X11" (and "_X12" if present),
    # which the ends_with("X_11") call below never matched, so
    # QS4_4_EDUCATIONALEXPEC_X11 survived the removal.
    "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    # Anchored so that only the X2 level itself is dropped.
    "QS4_30_INTERNETCON_X2$"
  )))
# Kept for backward compatibility: drops any column spelled with the "X_11"
# suffix, should one exist.
ohe_unimputed_train <- ohe_unimputed_train |>
  select(-ends_with("X_11"))
# Verify removal: list every remaining column of ohe_unimputed_train whose
# name ends in _X0, _X2, _X3, _X4, _X5, _X11 or _X_11.
# NOTE(review): this pattern is not restricted to the variables targeted by
# the removal above, so it also reports levels that were never meant to be
# dropped (the printed list includes e.g. QS2_17_TYPE_1_X2). A non-empty
# result therefore does not by itself mean the removal failed.
remaining_cols <- colnames(ohe_unimputed_train)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(ohe_unimputed_train))]
if (length(remaining_cols) > 0) {
message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
print(remaining_cols)
} else {
message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
} [1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
# Removal of numerically encoded No / Unsure / Prefer-not-to-say dummies
# (mean/mode-imputed test).
# Each regex alternative pairs one variable list with the level suffixes that
# encode a non-response for those variables; `$` anchors the suffix at the end
# of the column name.
mean_mode_imputed_test <- mean_mode_imputed_test |>
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    # Fix: the one-hot columns are named "<var>_X11" (and "_X12" if present),
    # which the ends_with("X_11") call below never matched, so
    # QS4_4_EDUCATIONALEXPEC_X11 survived the removal.
    "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    # Anchored so that only the X2 level itself is dropped.
    "QS4_30_INTERNETCON_X2$"
  )))
# Kept for backward compatibility: drops any column spelled with the "X_11"
# suffix, should one exist.
mean_mode_imputed_test <- mean_mode_imputed_test |>
  select(-ends_with("X_11"))
# Verify removal: list every remaining column of mean_mode_imputed_test whose
# name ends in _X0, _X2, _X3, _X4, _X5, _X11 or _X_11.
# NOTE(review): this pattern is not restricted to the variables targeted by
# the removal above, so it also reports levels that were never meant to be
# dropped (the printed list includes e.g. QS2_17_TYPE_1_X2). A non-empty
# result therefore does not by itself mean the removal failed.
remaining_cols <- colnames(mean_mode_imputed_test)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(mean_mode_imputed_test))]
if (length(remaining_cols) > 0) {
message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
print(remaining_cols)
} else {
message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
} [1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
# Removal of numerically encoded No / Unsure / Prefer-not-to-say dummies
# (unimputed one-hot-encoded test).
# Each regex alternative pairs one variable list with the level suffixes that
# encode a non-response for those variables; `$` anchors the suffix at the end
# of the column name.
ohe_unimputed_test <- ohe_unimputed_test |>
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    # Fix: the one-hot columns are named "<var>_X11" (and "_X12" if present),
    # which the ends_with("X_11") call below never matched, so
    # QS4_4_EDUCATIONALEXPEC_X11 survived the removal.
    "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    # Anchored so that only the X2 level itself is dropped.
    "QS4_30_INTERNETCON_X2$"
  )))
# Kept for backward compatibility: drops any column spelled with the "X_11"
# suffix, should one exist.
ohe_unimputed_test <- ohe_unimputed_test |>
  select(-ends_with("X_11"))
# Verify removal: list every remaining column of ohe_unimputed_test whose
# name ends in _X0, _X2, _X3, _X4, _X5, _X11 or _X_11.
# NOTE(review): this pattern is not restricted to the variables targeted by
# the removal above, so it also reports levels that were never meant to be
# dropped (the printed list includes e.g. QS2_17_TYPE_1_X2). A non-empty
# result therefore does not by itself mean the removal failed.
remaining_cols <- colnames(ohe_unimputed_test)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(ohe_unimputed_test))]
if (length(remaining_cols) > 0) {
message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
print(remaining_cols)
} else {
message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
} [1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
7.1.1 Remove Possible Dependent Variables of Interest
Data leakage is the inadvertent disclosure (or leaking) of information about the dependent outcome variables (or the testing data set) to the predictor variables in the training set. If we allow the outcome variables to exert antecedent influence on their predictors, and subsequently use the very same predictors to predict those outcome variables, we end up in a circular-reasoning fallacy, which can lead to a falsely inflated relationship between predictors and outcomes.
To avoid data leakage, we need to carefully remove all possible dependent variables we might use as outcomes, especially BEFORE we perform KNN imputation.
- KNN imputation assesses the relationships between variables to approximate missing values. If dependent variables (DVs) are included, the imputed values for the independent variables (IVs) will be influenced by the DVs, leading to a biased relationship between the IVs and the DVs.
Decision: We will pass through (`passthrough`) the possible DVs in the KNN imputation pipeline in Python.
# Name prefixes of every candidate outcome (DV) question. The one-hot-encoded
# columns derived from a question all start with its prefix.
candidate_dv_prefixes <- c(
  "QS1_19_HIGHSCHOOL", "QS4_19_CURRENTME", "QS4_18_CURRENTOR",
  "QS1_20_HIGHSCHOOL", "QS1_21_FURTHEDUCA", "QS1_22_HIGHESTEDU",
  "QS1_23_YEARCOMPLE", "QS1_25_EMPLOYMENT", "QS1_26_EMPLOYMENT",
  "QS1_27_PLANNINGRE", "QS1_28_EMPLOYMENT", "QS4_4_EDUCATIONALEXPEC",
  "QS4_5_SATEDU", "QS4_6_DISAPPOINTED", # these are self-motivation variables
  "QS4_7_SOCIALCAPITAL", "QS4_8_HELPSEEKING", "QS4_9_MENTALHEALTH",
  "QS4_10_MENTALWELLBE", "QS4_11_BELONGING", "QS4_12_TRUST",
  "QS4_16_FORMALVOL", "QS4_25_FUTUREMEN", "QS4_21_MENTORING",
  "QS4_17_SERVEDASM", "QS4_18_CURRENTOR", "QS4_22_PASTMENTO",
  "QS4_1_MEANINGFULPERSON", "QS4_13_LIFEEVE"
)

# Per-column count of missing values among the candidate DV columns of the
# unimputed one-hot-encoded training set, printed one column per row.
ohe_unimputed_train |>
  select(starts_with(candidate_dv_prefixes)) |>
  summarize(across(everything(), \(col) sum(is.na(col)))) |>
  glimpse()
8 Data frame export:
8.0.0.1 Export train/test data sets
As comma separated file:
# write_csv(train_data, "../../dssg-2025-mentor-canada/Data/umimputed_train.csv")
# write_csv(ohe_unimputed_train, "../../dssg-2025-mentor-canada/Data/ohe_unimputed_train.csv")
# write_csv(test_data, "../../dssg-2025-mentor-canada/Data/umimputed_test.csv")
#
# write_csv(mean_mode_imputed_train, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.csv")
# write_csv(mean_mode_imputed_test, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_test.csv")
As R object (to retain the transformed data types):
# write_rds(train_data, "../../dssg-2025-mentor-canada/Data/umimputed_train.Rds")
# write_rds(test_data, "../../dssg-2025-mentor-canada/Data/umimputed_test.Rds")
# write_rds(mean_mode_imputed_train, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.Rds")
- Alternative option: Try using Feather to preserve the transformed data types:
# Write each train/test data frame to a Feather file in the shared data
# directory. The list names are the output file stems, and the files are
# written in list order.
feather_dir <- "../../dssg-2025-mentor-canada/Data"
feather_exports <- list(
  umimputed_train = train_data,
  mean_mode_imputed_train = mean_mode_imputed_train,
  ohe_unimputed_train = ohe_unimputed_train,
  umimputed_test = test_data,
  mean_mode_imputed_test = mean_mode_imputed_test,
  ohe_unimputed_test = ohe_unimputed_test
)
for (stem in names(feather_exports)) {
  write_feather(
    feather_exports[[stem]],
    file.path(feather_dir, paste0(stem, ".feather"))
  )
}